import re
import string
import time
from urllib.parse import urljoin

import pandas
import requests
from bs4 import BeautifulSoup


# CSV file listing the articles to scrape; it is read with pandas and must
# contain a "url" column.
input_file = "LINKS_sputnik_rs_vaccine.csv"

# CSV file the scraped article records are written to (UTF-8 with BOM).
output_file = "DATA_sputnik_rs_vaccine.csv"

# Optional term filters, matched with re.search against the lowercased,
# punctuation-stripped article text (so each term is treated as a regex).
# - include_one_of_these_terms: keep an article if at least one term matches.
# - include_all_terms: keep an article only if every term matches.
# An empty list disables that filter; no filtering is needed for this scrape.
include_one_of_these_terms = []
include_all_terms = []

# Set to "yes" (case-insensitive) to follow the link that old-format pages
# show to their new-format article. Any other value turns this off, which
# makes the run faster.
check_old = "yes"

# Base URL (scheme + host) of the Sputnik edition being scraped.
domain = "https://rs-lat.sputniknews.com"




# Seconds to wait before every HTTP request, so that a large link list does
# not overload the server.
REQUEST_DELAY_SECONDS = 0.5

# Seconds after which an unresponsive request is abandoned. Without a timeout
# a single stalled connection would hang the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# URLs containing one of these substrings are topic / keyword / person / event
# listing pages rather than articles, so they are never requested.
NON_ARTICLE_MARKERS = ("keyword", "person", "event", "organization")

# Characters removed from titles and article text: ASCII punctuation plus the
# dashes and typographic quotes that string.punctuation does not include.
PUNCTUATION_TO_STRIP = string.punctuation + "–" + "—" + "„" + "”" + "“"
_STRIP_TABLE = str.maketrans("", "", PUNCTUATION_TO_STRIP)

# Paragraph text is sometimes mushed together without spaces, so sentence
# punctuation becomes a space before splitting into words. Newlines are
# dropped outright, exactly as the script has always done.
_SEPARATOR_TABLE = str.maketrans(
    {";": " ", ".": " ", ":": " ", ",": " ", "\n": ""}
)

# find_all() arguments for every place article text can live inside
# article__body: regular paragraphs, articles that do not use <p>, and photo
# essay captions. Reading the whole body instead would also pick up the
# embedded teasers for other Sputnik articles.
ARTICLE_TEXT_SELECTORS = (
    ("p",),
    ("div", {"class": "article__text"}),
    ("div", {"class": "article__photo-item-text"}),
)

MONTH_ABBREVIATIONS = (
    "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
)


def split_words(text):
    """Return the words of *text*, treating ; . : , as word separators."""
    return text.translate(_SEPARATOR_TABLE).split()


def strip_punctuation(text):
    """Return *text* with every character of PUNCTUATION_TO_STRIP removed."""
    return text.translate(_STRIP_TABLE)


def matches_filters(text, any_terms, all_terms):
    """Return True if *text* satisfies both term filters.

    Each term is lowercased and used as a regular expression. At least one of
    *any_terms* must match and every one of *all_terms* must match; an empty
    list means that filter is not applied.
    """
    if any_terms and not any(
        re.search(term.lower(), text) for term in any_terms
    ):
        return False
    return all(re.search(term.lower(), text) for term in all_terms)


def format_date(numeric_date):
    """Turn a "DD.MM.YYYY" date into "DD MON YYYY" (e.g. "05 JAN 2021")."""
    formatted = numeric_date
    for number, abbreviation in enumerate(MONTH_ABBREVIATIONS, start=1):
        formatted = formatted.replace(
            ".{:02d}.".format(number), " " + abbreviation + " "
        )
    return formatted


def fetch_soup(url):
    """Download *url* (after the politeness delay) and return its parsed HTML.

    Raises requests.RequestException on network errors or timeouts.
    """
    time.sleep(REQUEST_DELAY_SECONDS)
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    return BeautifulSoup(response.content, "html.parser")


def extract_raw_text(soup):
    """Return the announce text and body words of an article, space-joined.

    Raises AttributeError when the page has no announce text or no article
    body, which is how non-article pages are recognised.
    """
    pieces = [
        soup.find("div", {"class": "article__announce-text"}).text.strip()
    ]
    body = soup.find("div", {"class": "article__body"})
    for selector in ARTICLE_TEXT_SELECTORS:
        for node in body.find_all(*selector):
            pieces.extend(split_words(node.text))
    return " ".join(pieces)


def build_record(soup, url, any_terms, all_terms):
    """Build the output row for one parsed article page.

    Returns a dict with the keys full_text, title, date, views, tag_list and
    url, or None when the article text does not pass the term filters.
    Raises AttributeError when the page is not laid out like an article.
    """
    full_text = strip_punctuation(extract_raw_text(soup)).lower()
    if not matches_filters(full_text, any_terms, all_terms):
        return None

    title = soup.find("h1", {"class": "article__title"}).text

    # Occasional articles come without a date.
    try:
        date_text = soup.find("div", {"class": "article__info-date"}).text
        date = format_date(date_text.split(" ")[1])
    except (AttributeError, IndexError):
        date = "NA"

    # Some editions (e.g. Sputnik Poland) do not show a view counter.
    views_node = soup.find("div", {"class": "views__count"})
    views = views_node.text if views_node is not None else "NA"

    return {
        "full_text": full_text,
        "title": strip_punctuation(title),
        "date": date,
        "views": views,
        "tag_list": [
            tag.text for tag in soup.find_all("a", {"class": "tag__text"})
        ],
        "url": url,
    }


def scrape_article(url, any_terms, all_terms, follow_old_links, base_url):
    """Scrape one article URL and return its record, or None.

    If the page is not a new-format article and *follow_old_links* is true,
    the first "list__title" link on the page is resolved against *base_url*
    and scraped instead; old-format pages link to their new-format version
    that way. The fallback is only tried when the first attempt failed, so a
    successfully scraped article is never overwritten or stored twice.

    Raises requests.RequestException if the initial download fails.
    """
    soup = fetch_soup(url)
    try:
        return build_record(soup, url, any_terms, all_terms)
    except AttributeError:
        # Not a new-format article (topic page, old layout, ...).
        if not follow_old_links:
            return None

    link = soup.find("a", {"class": "list__title"})
    href = link.get("href") if link is not None else None
    if not href:
        return None
    try:
        new_soup = fetch_soup(urljoin(base_url, href))
        return build_record(new_soup, url, any_terms, all_terms)
    except (AttributeError, requests.RequestException):
        # The linked page is not an article either, or could not be fetched.
        return None


def scrape_all(urls, any_terms, all_terms, follow_old_links, base_url):
    """Scrape every URL in *urls* and return the list of article records.

    Progress is printed for each URL. URLs of listing pages (see
    NON_ARTICLE_MARKERS) are skipped without a request, and a URL whose
    download fails is reported and skipped so the records collected so far
    are not lost.
    """
    records = []
    total = len(urls)
    for position, url in enumerate(urls, start=1):
        print(
            "This is how many we have scraped so far: "
            + str(position) + " out of " + str(total)
        )
        if any(marker in url for marker in NON_ARTICLE_MARKERS):
            continue
        try:
            record = scrape_article(
                url, any_terms, all_terms, follow_old_links, base_url
            )
        except requests.RequestException as error:
            print("Skipping " + url + " because the request failed: " + str(error))
            continue
        if record is not None:
            records.append(record)
    return records


def main():
    """Read the link list, scrape every article and write the results CSV."""
    links = pandas.read_csv(input_file)
    # Drop empty cells so that every entry is a usable URL string.
    url_list = links["url"].dropna().astype(str).tolist()

    print("This is the number of articles to scrape: " + str(len(url_list)))

    records = scrape_all(
        url_list,
        include_one_of_these_terms,
        include_all_terms,
        check_old.lower() == "yes",
        domain,
    )

    pandas.DataFrame(records).to_csv(output_file, encoding="utf-8-sig")


if __name__ == "__main__":
    main()



